<?php
/**
 * Scrapes article content from a Sina blog.
 */
header('Content-Type:text/html;charset=utf-8');
require_once('Snoopy/Snoopy.class.php');

// Local cache of the article-list page and how long (in seconds) a cached
// copy is considered fresh.
$cacheFile = 'sina.txt';
$cacheTtl  = 1800;

$Snoop = new Snoopy();

// Reuse the cached list page while it is younger than half an hour. An empty
// cache file is treated as stale so that a previously failed download is not
// served again.
if (file_exists($cacheFile)
    && filesize($cacheFile) > 0
    && filemtime($cacheFile) + $cacheTtl > time()
) {
    $content = file_get_contents($cacheFile);
} else {
    $URL = 'http://blog.sina.com.cn/s/articlelist_1197161814_0_1.html';
    $Snoop->fetch($URL);
    $content = $Snoop->results;

    // Only cache a usable response; writing an empty or failed result would
    // block every run for the next half hour.
    if (is_string($content) && $content !== '') {
        file_put_contents($cacheFile, $content);
    }
}

//echo '<pre>';

// Parse the list page held in $content into $list. Each entry of $list is
// array(title, article URL, publication time "YYYY-MM-DD HH:MM").
$list = array();

// Step 1: narrow the page down to the article-list container. The quantifier
// is lazy; the former "U" modifier was dropped because it inverts "*?" back to
// greedy, which is the opposite of what the pattern is written to do.
$listHtml = '';
$listPattern = '/<div class="articleList">([\w\W]*?)<div class="SG_page">/is';
if (is_string($content) && preg_match($listPattern, $content, $match) === 1) {
    $listHtml = $match[0];
}

// Step 2: one match per article row -- group 1 is the title cell markup,
// group 2 is the timestamp.
$rowPattern = '/<span class="atc_title">([\w\W]*?)<span class="atc_ic_b">[\w\W]*?<span class="atc_tm SG_txtc">(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2})/isu';

// Step 3: pull the title and href out of the anchor inside the title cell.
$linkPattern = '/<a title="([\w\W]*?)" target="_blank" href="([\w\W]*?)">/isu';

if ($listHtml !== '' && preg_match_all($rowPattern, $listHtml, $rows, PREG_SET_ORDER)) {
    foreach ($rows as $row) {
        // Skip rows whose anchor does not have the expected shape instead of
        // storing an entry full of nulls.
        if (preg_match($linkPattern, $row[1], $link) !== 1) {
            continue;
        }
        $list[] = array($link[1], $link[2], $row[2]);
    }
}

//print_r($list);

// Start scraping the detail page of one article.
// "item" is the zero-based index into $list, taken from the query string.
// It is cast to int because the value is echoed back into the page; a missing
// parameter means "start at the first article".
$item = isset($_GET['item']) ? (int) $_GET['item'] : 0;
echo '开始采集第'.$item.'条<br>';

// The title comes from a remote page, so escape it for output. Existing
// entities are left alone (double_encode = false) because the text was cut
// out of an HTML attribute.
$itemTitle = isset($list[$item][0]) ? $list[$item][0] : '';
echo htmlspecialchars($itemTitle, ENT_QUOTES, 'UTF-8', false).'<br/>';

// No URL for this index means we have walked past the end of the list.
if (empty($list[$item][1])) {
	die('采集完成');
}

 // $URL = $list[$item][1];
  // NOTE(review): a fixed article URL is fetched here while the per-item URL
  // ($list[$item][1]) is commented out just above -- looks like a debugging
  // leftover; confirm before relying on the item counter.
  $URL = 'http://blog.sina.com.cn/s/blog_43ddab760100cw8s.html';
  $Snoop = new Snoopy();
  $Snoop->fetch($URL);
  $content = $Snoop->results;

  // Cut the article body out of the page: everything between the
  // "articalContent" container and the end-of-body marker comment.
  $preg = '#"articalContent">([\w\W]*?)</div>[\W\s]*?<!-- 正文结束 -->#';
  if (is_string($content) && preg_match($preg, $content, $match) === 1) {
      $content = $match[1];  // the article body
  } else {
      $content = '';         // nothing recognisable was fetched
  }

 // $content = str_ireplace('img src','img real2_src',$content);
 // $content = str_ireplace('real_src','src',$content);

  // NOTE(review): _stripimgs() is not visible in this file; presumably it
  // returns the list of image URLs found in the markup -- confirm.
  $imgarr = $Snoop->_stripimgs($content);
  if (!is_array($imgarr)) {
      $imgarr = array();
  }

  // Placeholder image URL that gets swapped for the locally saved copy.
  $placeholder = 'http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif';
  $placeholderLen = strlen($placeholder);
  $searchFrom = 0;   // where to look for the next placeholder in $content
  $stamp = time();
  $imgIndex = 0;

  foreach ($imgarr as $img) {
      // Save the image locally. The counter keeps names unique when several
      // images are saved within the same second.
      $imgIndex++;
      $locaimg = $stamp.'_'.$imgIndex.'.jpg';
      $data = file_get_contents($img);
      $saved = $data !== false && file_put_contents($locaimg, $data) !== false;

      // The n-th placeholder is paired with the n-th image, so always advance
      // past the current placeholder, whether or not it is replaced.
      $pos = strpos($content, $placeholder, $searchFrom);
      if ($pos === false) {
          continue;
      }
      if ($saved) {
          // Point the tag at the local copy.
          $content = substr_replace($content, $locaimg, $pos, $placeholderLen);
          $searchFrom = $pos + strlen($locaimg);
      } else {
          // Download failed: leave this tag untouched.
          $searchFrom = $pos + $placeholderLen;
      }
  }

  print_r($content);
  $item++;
  // $item is cast to int below because it is written into a script element.
?>
<SCRIPT LANGUAGE="JavaScript">
<!--
	//window.location.href='?item=<?php echo (int) $item ?>';
//-->
</SCRIPT>